Executive summary

This file includes the code for computing the regression results regarding the predictors of the evolution of self-related health during the FinDonor study for the

Load the data

We load the data and assign groups as they were assigned in the first part of the study.

#Load questionnaire data
load(file = "../data/final_data.rdata") 
final_data <- data

#load measurement data
load(file = "../data/quest_2_donor_data.rdata")

FD_activity_biomarker_data <- data %>% 
  rename(Ferritin_beginning = Ferritin)

first_questionnaire_data <- final_data %>% filter(questionnaire == "First")
second_questionnaire_data <- final_data %>% filter(questionnaire == "Second")
rm(data)

Preprocessing

We remove donors with very high ferritin (/Ferritin > 400) and high CRP (> 30)

We compute for each donor their wight at first donation and then the difference in reported BMI between the first questionnaire and the last questionnaire.

We then build a categorical variable for smoking status. This variable will need to be dummy coded into 3 variables if it is used as a regression variable.

We will apply this coding as it was applied in a recent epidemiological study in the Japanese population. Importantly we will consider only changes from smoker to non-smoker, and not changes fron “daily” to “occasional” smoker, as was done in the previously cited paper(NOTE: they had much more precise quantification of smoking behavior (see questionnaire ) and a much larger N and still used a binary smoking/non-smoking catergorisation). However, in the modelling stage we find that smoking can not be used in the models.

We also add age to the dataframe.

The data object FD_activity_biomarker_data includes all necessary variables for implenting the model.

FD_activity_biomarker_data <-
  final_data %>% 
    group_by(donor) %>%
  # Get staring weight
    filter(questionnaire =="First") %>% 
    rename(starting_weight = weight) %>% 
    dplyr::select(donor, starting_weight) %>% 
    full_join(final_data, by = "donor") %>% 
  # Get difference in BMI
    dplyr::select(donor, starting_weight, BMI, questionnaire) %>% 
    spread(key = questionnaire, value = BMI ) %>% 
    mutate(BMI_diff = Second - First) %>% 
    dplyr::select(-First, -Second) %>% 
    full_join(final_data, by = "donor") %>% 
  # GEt smoking status evolution
    dplyr::select(donor, starting_weight, BMI_diff, smoking, questionnaire) %>% 
    spread(key = questionnaire, value = smoking) %>% 
    mutate(smoking_behavior = case_when(First == "no" & Second == "no" ~"never smoker",
                                        First == "no" &  Second %in% c("daily", "sometimes") ~ "new smoker",
                                        First %in% c("daily", "sometimes") & Second %in% c("daily", "sometimes") ~ "always smoker",
                                        First %in% c("daily", "sometimes") & Second == "no" ~"Former smoker",
                                        TRUE ~"NA")) %>%  
    dplyr::select(-First, -Second) %>% 
    full_join(final_data, by = "donor") %>%
    dplyr::select(donor, starting_weight, BMI_diff, health, smoking_behavior, questionnaire) %>% 
  # Get health status evolution
     spread(key = questionnaire, value = health) %>% 
     mutate(health_evolution = case_when(First < Second ~ "Improved",
                                  First == Second ~ "Stable",
                                  Second < First ~ "Worsened",
                                  TRUE ~"NA")) %>% 
    dplyr::select(-First, -Second) %>% 
  # Add number of months between donation events  
    full_join(final_data, by = "donor") %>%
    dplyr::select(donor, starting_weight, BMI_diff, health_evolution, smoking_behavior, date, questionnaire,group) %>% 
    spread(key = questionnaire, value = date) %>% 
  mutate(interval_between_quest = interval(First, Second) %>% 
                                    as.duration() %>% 
           as.numeric( "months") %>% 
           round(0)) %>% 
    dplyr::select(-First, -Second) %>% 
  # Add donor age
    left_join(first_questionnaire_data %>%
              dplyr::select(donor, age),
            by = "donor") %>%
      filter( group != "Women_pre_menop_no_mens") %>% 
  # Exckude donors with no measure of health evolution
  left_join(FD_activity_biomarker_data, by = "donor") %>%
  filter(health_evolution != "NA") %>% 
  drop_na(starting_weight,BMI_diff ) %>% 
  filter(Ferritin_beginning <= 400 & CRP < 30)

final_data <-
  final_data %>% 
    filter(donor %in% FD_activity_biomarker_data$donor)

We first count how many donors we have in each group.

final_data %>% 
  filter(questionnaire == "First") %>% 
   count(group)
## # A tibble: 3 x 2
##   group                    n
##   <fct>                <int>
## 1 Men                    589
## 2 Post_menopause_women   353
## 3 Pre_menopause_women    474

Regressions

Regression data

We select the variabels that will be included in the models.

We also run some trasnformations to make the OR more interpretable:

  • Ferritin is transformed to \[log(Ferritin)/log(2)\] so that a unit increase in the transformed variable corresponds to a doubling of ferritin
  • weight, age and Hb were divided by 5 so that a unit increase in the transformed varaible corresponds to an increase in 5 kg or 5 years or 5 Hb points.
regression_data <-
  FD_activity_biomarker_data %>% 
  inner_join(first_questionnaire_data %>% 
               dplyr::select(donor, health),
             by = "donor") %>% 
  mutate(health_very_good = ifelse(health == "Very_good",1,0),
         health_excellent = ifelse(health == "Excellent", 1, 0),
         health_poor = ifelse(health == "Poor", 1, 0),
         health_satisfactory = ifelse(health == "Satisfactory", 1, 0)
         ) %>% 
  mutate(health_outcome = ifelse(health_evolution == "Worsened", 1,0)) %>%
  mutate(health_outcome_neg = ifelse(health_evolution == "Improved", 1,0)) %>%
  mutate(donation_frequency = nb_donations_between_questionnaires/(interval_between_quest/12) ) %>% 
  mutate(log_Ferritin_beginning = log(Ferritin_beginning)/log(2),
         starting_weight_o =starting_weight, 
         starting_weight = starting_weight/5,
         age_o = age,
         age = age/5,
         Hb_beginning_o = Hb_v,
         Hb_beginning = Hb_v/5) %>% 
  dplyr::select(donor, starting_weight,BMI_diff, age, group, log_Ferritin_beginning, Hb_beginning,
                donation_frequency,health_poor,health_satisfactory, health_very_good,health_excellent,health_outcome,health_outcome_neg, smoking_behavior,Ferritin_beginning,age_o,starting_weight_o,Hb_beginning_o )

  
regression_data$smoking_behavior <- factor(regression_data$smoking_behavior)
regression_data <- droplevels(regression_data)
file <- "../results/hypregdata_unscaled.rdata"
save(regression_data,file=file)
regression_data %>% group_by(group) %>% 
     count()
## # A tibble: 3 x 2
## # Groups:   group [3]
##   group                    n
##   <fct>                <int>
## 1 Men                    589
## 2 Post_menopause_women   353
## 3 Pre_menopause_women    474

Table one

# myVars <- c( "age_o", "starting_weight_o" ,"BMI_diff", "Hb_beginning_o", "Ferritin_beginning",
#              "donation_frequency","initial_health","final_health")

myVars <- c(   "Age" ,
  "Initial weight (kg)" ,
  "BMI difference (second - first)" ,
  "Initial hemoglobin (g/l)" ,
  "Initial ferritin (ug/l)" ,
  "Donation frequency (yearly)" ,
  "Inital health rating",
  "Final health rating")


non_normal_vars <- c("Initial ferritin (ug/l)")

health1<- first_questionnaire_data %>% dplyr::select(donor,health) %>% rename(initial_health=health)
health2<- second_questionnaire_data %>% dplyr::select(donor,health) %>% rename(final_health=health)

table1data <- regression_data %>% left_join(health1) %>% left_join(health2) 
table1data <- table1data %>% dplyr::select(group,age_o,starting_weight_o,BMI_diff,Hb_beginning_o,Ferritin_beginning,donation_frequency,initial_health,final_health) %>%
  rename(
  "Age" = age_o,
  "Initial weight (kg)" = starting_weight_o ,
  "BMI difference (second - first)" = BMI_diff,
  "Initial hemoglobin (g/l)" = Hb_beginning_o,
  "Initial ferritin (ug/l)" = Ferritin_beginning,
  "Donation frequency (yearly)" =donation_frequency,
  "Inital health rating"=initial_health,
  "Final health rating"= final_health
)


summary_table <- CreateTableOne(data = table1data,vars=myVars, strata = "group",test = FALSE)
  
tab3Mat <- print(summary_table, nonnormal = non_normal_vars,vars=myVars, quote = FALSE, noSpaces = TRUE, printToggle = FALSE)


tab3Mat %>% 
  kable()
Men Post_menopause_women Pre_menopause_women
n 589 353 474
Age (mean (SD)) 47.65 (13.20) 58.58 (5.57) 35.51 (10.04)
Initial weight (kg) (mean (SD)) 85.23 (14.38) 71.09 (12.86) 70.74 (14.06)
BMI difference (second - first) (mean (SD)) 0.23 (1.39) 0.28 (1.53) 0.90 (1.91)
Initial hemoglobin (g/l) (mean (SD)) 150.38 (9.64) 138.07 (8.03) 135.28 (7.80)
Initial ferritin (ug/l) (median [IQR]) 41.00 [25.00, 68.00] 33.00 [21.00, 51.00] 25.00 [16.00, 41.00]
Donation frequency (yearly) (mean (SD)) 3.00 (1.40) 2.41 (0.94) 1.92 (0.96)
Inital health rating (%)
Poor 0 (0.0) 0 (0.0) 0 (0.0)
Satisfactory 15 (2.5) 6 (1.7) 9 (1.9)
Good 175 (29.7) 142 (40.2) 156 (32.9)
Very_good 294 (49.9) 146 (41.4) 236 (49.8)
Excellent 105 (17.8) 59 (16.7) 73 (15.4)
Final health rating (%)
Poor 2 (0.3) 1 (0.3) 2 (0.4)
Satisfactory 36 (6.1) 30 (8.5) 28 (5.9)
Good 213 (36.2) 136 (38.5) 195 (41.1)
Very_good 256 (43.5) 149 (42.2) 192 (40.5)
Excellent 82 (13.9) 37 (10.5) 57 (12.0)
  write.csv(tab3Mat, file = paste0("../results/table_1_population_",gsub("-","",as.Date(Sys.time())),".csv"))

Pre-menopausal women

regression_data_pre_menop_women <-
  regression_data %>% 
  filter(group == "Pre_menopause_women") %>% 
  dplyr::select(donor, age, starting_weight, BMI_diff,
                log_Ferritin_beginning, Hb_beginning, donation_frequency) %>% 
  gather(key = key, value = value, -donor) %>% 
  group_by(key) %>% 
  mutate(value_scale = scale(value, scale = FALSE)[,1]) %>% 
  dplyr::select(-value) %>% 
  spread(key = key, value = value_scale) %>% 
  inner_join(regression_data %>% 
               dplyr::select(donor,health_poor,health_satisfactory, health_very_good, health_excellent, health_outcome,health_outcome_neg,smoking_behavior),
             by = "donor")

Correlograms

logit_pre_menop_women <- glm(health_outcome ~ age + starting_weight + BMI_diff + 
                               log_Ferritin_beginning + Hb_beginning +
                              donation_frequency + health_very_good + health_excellent,
                             data=regression_data_pre_menop_women,
                            family="binomial")
 
 summary(logit_pre_menop_women)
## 
## Call:
## glm(formula = health_outcome ~ age + starting_weight + BMI_diff + 
##     log_Ferritin_beginning + Hb_beginning + donation_frequency + 
##     health_very_good + health_excellent, family = "binomial", 
##     data = regression_data_pre_menop_women)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6542  -0.9426  -0.4128   1.0571   2.5083  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.37444    0.27418  -8.660  < 2e-16 ***
## age                     0.05216    0.05786   0.902   0.3673    
## starting_weight         0.12963    0.04129   3.139   0.0017 ** 
## BMI_diff                0.11202    0.05750   1.948   0.0514 .  
## log_Ferritin_beginning -0.03595    0.11678  -0.308   0.7582    
## Hb_beginning            0.00524    0.07461   0.070   0.9440    
## donation_frequency     -0.14188    0.11567  -1.227   0.2200    
## health_very_good        2.05066    0.30789   6.660 2.73e-11 ***
## health_excellent        2.92329    0.37726   7.749 9.28e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 604.79  on 473  degrees of freedom
## Residual deviance: 509.53  on 465  degrees of freedom
## AIC: 527.53
## 
## Number of Fisher Scoring iterations: 5
plot(logit_pre_menop_women)

vif(logit_pre_menop_women)
##                    age        starting_weight               BMI_diff 
##               1.160649               1.187453               1.047510 
## log_Ferritin_beginning           Hb_beginning     donation_frequency 
##               1.212132               1.135600               1.043167 
##       health_very_good       health_excellent 
##               1.900105               1.941321
tidy_logit_pre_menop_women <-
  logit_pre_menop_women %>% 
  tidy %>% 
  mutate(OR = exp(estimate),
         group = "Pre_menopause_women")

Post-menopausal women

regression_data_post_menop_women <-
  regression_data %>% 
  filter(group == "Post_menopause_women") %>% 
  dplyr::select(donor, age, starting_weight, BMI_diff,
                log_Ferritin_beginning,Hb_beginning,donation_frequency) %>% 
  gather(key = key, value = value, -donor) %>% 
  group_by(key) %>% 
  mutate(value_scale = scale(value, scale = FALSE)[,1]) %>% 
  dplyr::select(-value) %>% 
  spread(key = key, value = value_scale) %>% 
  inner_join(regression_data %>% 
               dplyr::select(donor, health_poor,health_satisfactory,health_very_good, health_excellent, health_outcome,health_outcome_neg,smoking_behavior),
             by = "donor")

Correlograms

Models and results

logit_post_menop_women <- glm(health_outcome ~ age + starting_weight + BMI_diff + 
                                log_Ferritin_beginning + Hb_beginning + donation_frequency +
             health_very_good + health_excellent,
             data=regression_data_post_menop_women,
             family="binomial")
 
 summary(logit_post_menop_women)
## 
## Call:
## glm(formula = health_outcome ~ age + starting_weight + BMI_diff + 
##     log_Ferritin_beginning + Hb_beginning + donation_frequency + 
##     health_very_good + health_excellent, family = "binomial", 
##     data = regression_data_post_menop_women)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6885  -0.7666  -0.5016   0.8962   2.3779  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.128759   0.268687  -7.923 2.32e-15 ***
## age                     0.001707   0.122117   0.014  0.98885    
## starting_weight         0.132685   0.055185   2.404  0.01620 *  
## BMI_diff                0.261035   0.089226   2.926  0.00344 ** 
## log_Ferritin_beginning  0.001927   0.152094   0.013  0.98989    
## Hb_beginning           -0.009034   0.088292  -0.102  0.91851    
## donation_frequency     -0.387220   0.144547  -2.679  0.00739 ** 
## health_very_good        1.331872   0.325173   4.096 4.21e-05 ***
## health_excellent        2.557690   0.407650   6.274 3.51e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.03  on 352  degrees of freedom
## Residual deviance: 357.36  on 344  degrees of freedom
## AIC: 375.36
## 
## Number of Fisher Scoring iterations: 4
tidy_logit_post_menop_women <-
  logit_post_menop_women %>% 
  tidy %>% 
  mutate(OR = exp(estimate),
         group = "Post_menopause_women")

Diagnostics

We check that there is no problematic collinearity betwen regressors (VIF < 5)

vif(logit_post_menop_women)
##                    age        starting_weight               BMI_diff 
##               1.108134               1.157356               1.139544 
## log_Ferritin_beginning           Hb_beginning     donation_frequency 
##               1.130102               1.159486               1.059071 
##       health_very_good       health_excellent 
##               1.557766               1.729163
plot(logit_post_menop_women)

Men

regression_data_men <-
  regression_data %>% 
  filter(group == "Men") %>% 
  dplyr::select(donor, age, starting_weight, BMI_diff,
                log_Ferritin_beginning, Hb_beginning, 
                donation_frequency) %>% 
  gather(key = key, value = value, -donor) %>% 
  group_by(key) %>% 
  mutate(value_scale = scale(value, scale = FALSE)[,1]) %>% 
  dplyr::select(-value) %>% 
  spread(key = key, value = value_scale) %>% 
  inner_join(regression_data %>% 
               dplyr::select(donor, health_poor,health_satisfactory, health_very_good, health_excellent,health_outcome,health_outcome_neg,smoking_behavior),
             by = "donor")

Correlograms

Models and results

logit_men <- glm(health_outcome ~ age + starting_weight + BMI_diff + 
                   log_Ferritin_beginning + Hb_beginning + donation_frequency +
             health_very_good + health_excellent,
             data=regression_data_men,
             family="binomial")
 
 summary(logit_men)
## 
## Call:
## glm(formula = health_outcome ~ age + starting_weight + BMI_diff + 
##     log_Ferritin_beginning + Hb_beginning + donation_frequency + 
##     health_very_good + health_excellent, family = "binomial", 
##     data = regression_data_men)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5230  -0.8189  -0.4643   0.9546   2.5087  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.53503    0.26804  -9.458  < 2e-16 ***
## age                     0.02350    0.04026   0.584  0.55941    
## starting_weight         0.17939    0.03660   4.902 9.48e-07 ***
## BMI_diff                0.10428    0.07410   1.407  0.15933    
## log_Ferritin_beginning -0.28867    0.11635  -2.481  0.01310 *  
## Hb_beginning            0.07268    0.05482   1.326  0.18495    
## donation_frequency     -0.20859    0.07955  -2.622  0.00874 ** 
## health_very_good        1.79474    0.29514   6.081 1.19e-09 ***
## health_excellent        2.72150    0.34306   7.933 2.14e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 706.07  on 588  degrees of freedom
## Residual deviance: 598.39  on 580  degrees of freedom
## AIC: 616.39
## 
## Number of Fisher Scoring iterations: 5
library(epiDisplay)
logistic.display(logit_men)
## 
## Logistic regression predicting health_outcome 
##  
##                                     crude OR(95%CI)   adj. OR(95%CI)    
## age (cont. var.)                    0.97 (0.9,1.03)   1.02 (0.95,1.11)  
##                                                                         
## starting_weight (cont. var.)        1.13 (1.06,1.2)   1.2 (1.11,1.29)   
##                                                                         
## BMI_diff (cont. var.)               1.09 (0.96,1.24)  1.11 (0.96,1.28)  
##                                                                         
## log_Ferritin_beginning (cont. var.) 0.93 (0.78,1.12)  0.75 (0.6,0.94)   
##                                                                         
## Hb_beginning (cont. var.)           1.06 (0.97,1.16)  1.08 (0.97,1.2)   
##                                                                         
## donation_frequency (cont. var.)     0.91 (0.8,1.04)   0.81 (0.69,0.95)  
##                                                                         
## health_very_good: 1 vs 0            1.58 (1.1,2.26)   6.02 (3.37,10.73) 
##                                                                         
## health_excellent: 1 vs 0            3.4 (2.2,5.26)    15.2 (7.76,29.78) 
##                                                                         
##                                     P(Wald's test) P(LR-test)
## age (cont. var.)                    0.559          0.559     
##                                                              
## starting_weight (cont. var.)        < 0.001        < 0.001   
##                                                              
## BMI_diff (cont. var.)               0.159          0.158     
##                                                              
## log_Ferritin_beginning (cont. var.) 0.013          0.012     
##                                                              
## Hb_beginning (cont. var.)           0.185          0.184     
##                                                              
## donation_frequency (cont. var.)     0.009          0.008     
##                                                              
## health_very_good: 1 vs 0            < 0.001        < 0.001   
##                                                              
## health_excellent: 1 vs 0            < 0.001        < 0.001   
##                                                              
## Log-likelihood = -299.1957
## No. of observations = 589
## AIC value = 616.3914
tidy_logit_men <-
  logit_men %>% 
  tidy %>% 
  mutate(OR = exp(estimate),
         group = "Men")

Diagnostics

plot(logit_men)

vif(logit_men)
##                    age        starting_weight               BMI_diff 
##               1.102262               1.121451               1.021828 
## log_Ferritin_beginning           Hb_beginning     donation_frequency 
##               1.246534               1.140851               1.187989 
##       health_very_good       health_excellent 
##               2.075282               2.175625
reg_data<- list(men=regression_data_men,womenpost=regression_data_post_menop_women,womenpre=regression_data_pre_menop_women)
file <- "../results/hypregdata.rdata"
save(reg_data,file=file)

Bootstraps

# https://www.painblogr.org/2017-10-18-purrring-through-bootstraps.html


get_coefficients <- function(data, boot_ind){
  fit <- glm (health_outcome~.,
              data[boot_ind,],
              family = "binomial")
  return(coef(fit))
}



compute_Bca_CI <-function(fit_boot){
  Bca_inf = rep(0, length(fit_boot$t0))
  Bca_sup = rep(0, length(fit_boot$t0))
  for (i_regressor in 1:length(fit_boot$t0)){
    CI <- boot.ci(fit_boot, type = "bca", index=i_regressor)
    Bca_inf[i_regressor] <- CI$bca[4]
    Bca_sup[i_regressor] <- CI$bca[5]
  }
  return(tibble(Bca_inf,Bca_sup, regressor = names(fit_boot$t0)) %>% 
           filter(regressor != "(Intercept)"))
}


get_bootstrap <- function(data, nb_boot){
 
 

    fit_boot <- boot(data, statistic= get_coefficients, R = nb_boot)


  fit_boot_distrib <- as.tibble(fit_boot$t)  
  names(fit_boot_distrib) <- names(fit_boot$t0)

  fit_boot_Bca <- compute_Bca_CI(fit_boot)
  
  return(list(fit_boot_distrib,fit_boot_Bca))
}




get_bootstrap_coeffs <- function(regression_data, current_group, nb_boot )
{

  ## preprocess and standardize data

  
    test_data_std <- regression_data %>% 
      filter(group == current_group) %>% 
      dplyr::select(donor, age, starting_weight, BMI_diff,
                    log_Ferritin_beginning, Hb_beginning, 
                    donation_frequency) %>% 
      gather(key = key, value = value, -donor) %>% 
      group_by(key) %>% 
      mutate(value_scale = scale(value, scale = FALSE)[,1]) %>% 
      dplyr::select(-value) %>% 
      spread(key = key, value = value_scale) %>% 
      inner_join(regression_data %>% 
                   dplyr::select(donor, health_outcome, health_very_good, health_excellent),
                 by = "donor") %>% 
    dplyr::select(-donor) 

  
  
  result <- test_data_std %>% 
  get_bootstrap(nb_boot = nb_boot)


  bootstrap_distrib <- result[[1]] %>% 
    gather(key = regressor, value = coefficient) %>% 
    filter(regressor != "(Intercept)") %>% 
    mutate(group = current_group)
    
  Bca_CI  <- result[[2]] %>% mutate(group = current_group)

  return(list(bootstrap_distrib,Bca_CI))
}


output_file_distrib = ("../results/bootstraps/coeff_boot_distrib_seed_125_ctr_strata_final.rdata")
output_file_Bca = ("../results/bootstraps/coeff_boot_Bca_seed_125_ctr_strata_final.rdata")


#get_from_file_ferr = TRUE

get_from_file_ferr <- all(file.exists(output_file_distrib),file.exists(output_file_Bca) )

set.seed(125)
group_str = "Pre_menopause_women"

if (get_from_file_ferr){
  load(file=output_file_distrib)
  load(file=output_file_Bca)
}else
{
   for(group_str in c("Pre_menopause_women", "Post_menopause_women","Men")){
    
    stuff <- get_bootstrap_coeffs(regression_data, group_str, nb_boot = 10000)
    
    if(!exists("bootstrap_distrib")){
      bootstrap_distrib <- stuff[[1]]
      bootstrap_Bca_CI <- stuff[[2]]
    }else
    {
      bootstrap_distrib<-bind_rows(bootstrap_distrib, stuff[[1]])
      bootstrap_Bca_CI<-bind_rows(bootstrap_Bca_CI, stuff[[2]])
    }
  } 
  save(bootstrap_distrib,file=output_file_distrib) 
  save(bootstrap_Bca_CI,file=output_file_Bca)
}

Forest plots

# "\U0394 BMI"

regressor_values <- c(
 "age" = "Age",
   "BMI_diff" = "BMI difference",
   "donation_frequency" = "Donation frequency",
   "Hb_beginning" = "Initial Hemoglobin",
   "log_Ferritin_beginning" ="Initial Ferritin",
   "starting_weight" ="Initial weight",
   "health_very_good" ="Initial health rating \n Very good vs Satisfactory/Good",
    "health_excellent" = "Initial health rating \n Excellent vs Satisfactory/Good"
)
bootstrap_Bca_CI <-
  tidy_logit_men %>% 
  bind_rows(tidy_logit_post_menop_women) %>% 
  bind_rows(tidy_logit_pre_menop_women) %>% 
  dplyr::select(group, term, OR, p.value) %>% 
  rename(regressor = term) %>% 
  full_join(bootstrap_Bca_CI,
            by =  c("regressor", "group")) %>% 
  filter(regressor != "(Intercept)") %>% 
  mutate(Bca_inf = exp(Bca_inf),
         Bca_sup = exp(Bca_sup),
         is_sig = p.value < 0.05) %>% 
  mutate(regressor = plyr::revalue(regressor, regressor_values),
    regressor = ordered(regressor,
                    levels =  c("Age", "Initial weight", "BMI difference",  "Initial Hemoglobin", 
                                "Initial Ferritin","Donation frequency",
                               "Initial health rating \n Very good vs Satisfactory/Good",
                               "Initial health rating \n Excellent vs Satisfactory/Good")),
    regressor = fct_rev(regressor),
    group = case_when(group == "Pre_menopause_women" ~ "Pre-menopausal women",
      group == "Post_menopause_women" ~ "Post-menopausal women",
      TRUE ~ "Men"),
    group = ordered(group, levels = c(  "Pre-menopausal women",  "Post-menopausal women", "Men"  ) ))

file <- "../results/hyp_bootstrap_Bca_CI.csv"
temp <- bootstrap_Bca_CI %>% mutate_if(is.numeric, round, digits = 3) %>% dplyr::select(regressor,group,p.value,OR,Bca_inf,Bca_sup) %>% arrange(desc(regressor),group)
write.csv(temp ,file = file,row.names = FALSE)
# "\U0394 BMI"
bootstrap_distrib %>% 
  mutate(regressor = plyr::revalue(regressor, regressor_values),
  regressor = ordered(regressor,
                  levels =  c("Age", "Initial weight", "BMI difference",  "Initial Hemoglobin", 
                              "Initial Ferritin","Donation frequency",
                              "Initial health rating \n Very good vs Satisfactory/Good",
                              "Initial health rating \n Excellent vs Satisfactory/Good")),
  regressor = fct_rev(regressor),
  group = case_when(group == "Pre_menopause_women" ~ "Pre-menopausal women",
    group == "Post_menopause_women" ~ "Post-menopausal women",
    TRUE ~ "Men"),
  group = ordered(group, levels = c( "Men",  "Post-menopausal women", "Pre-menopausal women" ) )) %>% 
  mutate(OR = exp(coefficient)) %>% 
  ggplot(aes(x = regressor, y = OR, color = group, fill = group)) +
  geom_hline(yintercept = 1 , color = "grey", linetype = "dotted",
             size = 1) +
  # geom_violin(alpha = 0.15, trim = TRUE, position = position_dodge(width = 0.8), size = 0.1) +
  geom_pointrange(data = bootstrap_Bca_CI, 
                  aes(ymin=Bca_inf, ymax = Bca_sup, shape = is_sig, color = group, fill = group),
                  position = position_dodge(width = 0.7), size = 1) +
  # geom_point(data = bootstrap_Bca_CI, 
  #            aes(shape = is_sig, fill = group),
  #            position = position_dodge(width = 0.8), size = 2) +
    scale_color_manual(values = c(  "#0065BD", "#BA0303", "#C50084" ),
                     limits = c("Men",  "Post-menopausal women","Pre-menopausal women" )) +
     scale_fill_manual(values = c(  "#0065BD", "#BA0303", "#C50084" ),
                     limits = c("Men",  "Post-menopausal women","Pre-menopausal women" )) +
 scale_shape_manual(values = c(1,16)) +
  scale_y_log10(limits=c(1/40,40)) +
  # scale_x_discrete(labels=parse(text=unique(bootstrap_Bca_CI$regressor)))
  # ylim(1/35, 35) +
  guides(shape = FALSE) +
  theme_classic() +
  # theme(legend.position = c(0.8,0.9),
  #       legend.title = element_blank(),
  #       legend.box = "vertical",
  #       legend.text = element_text(size = 20),
  #       plot.title =  element_text(size = 24,
  #                                 hjust = 0),
  #       panel.grid.minor.y = element_blank(),
  #       axis.line = element_line(colour="black"),
  #       axis.title.y = element_blank(),
  #       axis.title.x = element_text(size = 24),
  #       axis.text = element_text(size = 22),
  #       panel.grid.major.y = element_blank()) +
  coord_flip() +
  xlab("Robust standardized coefficient") +
  labs(title = NULL,
       subtitle = NULL) +


   theme(axis.title = element_text(size = 14),
        strip.text =  element_text(size = 14),
        legend.text = element_text(size = 12),
        strip.background = element_rect(fill = "white"),
        strip.background.x = element_blank(),
        axis.text.y = element_text(size = 14),
        axis.text.x = element_text(size = 14, angle = 30,hjust = 0.8),
        plot.title =  element_text(size = 16),
        plot.subtitle = element_text(size = 16,
                                     hjust = 0.5))

   #
ggplot2::ggsave(filename = "../results/figures/hyp_log_reg_results_b.pdf",
       width = 35,
       height = 18,
       dpi = 600,
       units = "cm")

Separate plots for individual variables

fig.width=15
fig.height=20

Initial hemoglobin (g/l)

phbd <- final_data %>% 
  dplyr::select(donor, group, health, questionnaire) %>% 
  drop_na(group) %>% 
  filter( group != "Women_pre_menop_no_mens") %>% 
  spread(key = questionnaire, value = health) %>% 
  filter(First != "NA" & Second != "NA") %>%  
  mutate(Difference = case_when(First < Second ~ "Improved",
                                First == Second ~ "Stable",
                                TRUE ~ "Worsened")) %>% 
    mutate(Difference = ordered(Difference,levels=c("Worsened","Stable","Improved"))) %>% 
  inner_join(FD_activity_biomarker_data, by = c("donor", "group")) %>% 
  mutate(group = case_when(group == "Pre_menopause_women" ~ "Pre-menopausal \n women",
                           group == "Post_menopause_women" ~ "Post-menopausal \n women",
                           TRUE ~ "Men"),
         group = ordered(group, levels = c(  "Men", "Post-menopausal \n women", "Pre-menopausal \n women") ))

phb <-  ggplot(phbd,aes(x = Difference, y = Hb_v, color = group)) +
  # geom_hline(yintercept = 15, color = "red", linetype = 3) +
  geom_boxplot(outlier.alpha = 0) +
  geom_beeswarm(alpha = 0.5, shape = 1) +
  # scale_y_log10() +
  facet_grid(group ~.,scales="free_y") +
  scale_color_manual(values = c( "#ff85a2",  "#de2d26","#00BFFF" ),
                     limits = c("Pre-menopausal \n women", "Post-menopausal \n women","Men" )) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("Evolution of self-rated health") +
  ylab("Initial hemoglobin (g/l)")  


ggplot2::ggsave(plot=phb,filename = "../results/figures/hyp_hb.pdf",
       width = fig.width,
       height = fig.height,
       dpi = 600,
       units = "cm")

phb

phbd %>% group_by(group, Difference) %>% summarise(mean=mean(Hb_v),median=median(Hb_v))
## # A tibble: 9 x 4
## # Groups:   group [3]
##   group                      Difference  mean median
##   <ord>                      <ord>      <dbl>  <dbl>
## 1 Men                        Worsened    151.    149
## 2 Men                        Stable      150.    149
## 3 Men                        Improved    150.    150
## 4 "Post-menopausal \n women" Worsened    138.    137
## 5 "Post-menopausal \n women" Stable      138.    139
## 6 "Post-menopausal \n women" Improved    136.    138
## 7 "Pre-menopausal \n women"  Worsened    136.    135
## 8 "Pre-menopausal \n women"  Stable      135.    135
## 9 "Pre-menopausal \n women"  Improved    135.    135
phbd %>% group_by(group) %>% summarise(mean=mean(Hb_v),median=median(Hb_v))
## # A tibble: 3 x 3
##   group                       mean median
##   <ord>                      <dbl>  <dbl>
## 1 Men                         150.    149
## 2 "Post-menopausal \n women"  138.    138
## 3 "Pre-menopausal \n women"   135.    135

Initial ferritin

pferd <- final_data %>% 
  dplyr::select(donor, group, health, questionnaire) %>% 
  drop_na(group) %>% 
  filter( group != "Women_pre_menop_no_mens") %>% 
  spread(key = questionnaire, value = health) %>% 
  filter(First != "NA" & Second != "NA") %>%  
  mutate(Difference = case_when(First < Second ~ "Improved",
                                First == Second ~ "Stable",
                                TRUE ~ "Worsened")) %>% 
    mutate(Difference = ordered(Difference,levels=c("Worsened","Stable","Improved"))) %>% 
  inner_join(FD_activity_biomarker_data, by = c("donor", "group")) %>% 
  mutate(group = case_when(group == "Pre_menopause_women" ~ "Pre-menopausal \n women",
                           group == "Post_menopause_women" ~ "Post-menopausal \n women",
                           TRUE ~ "Men"),
         group = ordered(group, levels = c(  "Men", "Post-menopausal \n women", "Pre-menopausal \n women") ))
  
pfer <-  ggplot(pferd, aes(x = Difference, y = Ferritin_beginning, color = group)) +
  # geom_hline(yintercept = 15, color = "red", linetype = 3) +
  geom_boxplot(outlier.alpha = 0) +
  geom_beeswarm(alpha = 0.5, shape = 1) +
   scale_y_log10() +
  facet_grid(group ~.,scales="free_y") +
  #facet_grid(group ~.) +
  scale_color_manual(values = c( "#ff85a2",  "#de2d26","#00BFFF" ),
                     limits = c("Pre-menopausal \n women", "Post-menopausal \n women","Men" )) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("Evolution of self-rated health") +
  ylab("Initial ferritin (ug/l)") 
  #scale_y_log10(limits=c(10,100)) 
  
  


ggplot2::ggsave(plot=pfer,filename = "../results/figures/hyp_ferritin.pdf",
       width = fig.width,
       height = fig.height,
       dpi = 600,
       units = "cm")

pfer

pferd %>% group_by(group, Difference) %>% summarise(mean=mean(Ferritin_beginning),median=median(Ferritin_beginning),geoMean=exp(mean(log(Ferritin_beginning))))
## # A tibble: 9 x 5
## # Groups:   group [3]
##   group                      Difference  mean median geoMean
##   <ord>                      <ord>      <dbl>  <dbl>   <dbl>
## 1 Men                        Worsened    50.6   39      40.1
## 2 Men                        Stable      52.2   42      42.1
## 3 Men                        Improved    55.9   42.5    42.1
## 4 "Post-menopausal \n women" Worsened    42.1   33      34.5
## 5 "Post-menopausal \n women" Stable      40.7   33      33.0
## 6 "Post-menopausal \n women" Improved    40     33      32.4
## 7 "Pre-menopausal \n women"  Worsened    34.9   26      26.6
## 8 "Pre-menopausal \n women"  Stable      32.8   25      25.3
## 9 "Pre-menopausal \n women"  Improved    30.0   26      25.1
pferd %>% group_by(group) %>% summarise(mean=mean(Ferritin_beginning),median=median(Ferritin_beginning),geoMean=exp(mean(log(Ferritin_beginning))))
## # A tibble: 3 x 4
##   group                       mean median geoMean
##   <ord>                      <dbl>  <dbl>   <dbl>
## 1 Men                         52.2     41    41.5
## 2 "Post-menopausal \n women"  41.0     33    33.3
## 3 "Pre-menopausal \n women"   33.1     25    25.7

BMI difference

pbmd <- final_data %>% 
  dplyr::select(donor, group, health, questionnaire) %>% 
  drop_na(group) %>% 
  filter( group != "Women_pre_menop_no_mens") %>% 
  spread(key = questionnaire, value = health) %>% 
  filter(First != "NA" & Second != "NA") %>%  
  mutate(Difference = case_when(First < Second ~ "Improved",
                                First == Second ~ "Stable",
                                TRUE ~ "Worsened")) %>%
  mutate(Difference = ordered(Difference,levels=c("Worsened","Stable","Improved"))) %>% 
  inner_join(FD_activity_biomarker_data, by = c("donor", "group")) %>% 
  mutate(group = case_when(group == "Pre_menopause_women" ~ "Pre-menopausal \n women",
                           group == "Post_menopause_women" ~ "Post-menopausal \n women",
                           TRUE ~ "Men"),
         group = ordered(group, levels = c(  "Men", "Post-menopausal \n women", "Pre-menopausal \n women") ))

pbm <-  ggplot(pbmd, aes(x = Difference, y = BMI_diff, color = group)) +
  # geom_hline(yintercept = 15, color = "red", linetype = 3) +
  geom_boxplot(outlier.alpha = 0) +
  geom_beeswarm(alpha = 0.5, shape = 1) +
  #scale_y_log10() +
  facet_grid(group ~.,scales="free_y") +
  scale_color_manual(values = c( "#ff85a2",  "#de2d26","#00BFFF" ),
                     limits = c("Pre-menopausal \n women", "Post-menopausal \n women","Men" )) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("Evolution of self-rated health") +
  ylab("BMI difference") 
  #ylim(-5,5)

ggplot2::ggsave(plot=pbm,filename = "../results/figures/hyp_bmi_diff.pdf",
       width = fig.width,
       height = fig.height,
       dpi = 600,
       units = "cm")

pbm

pbmd %>% group_by(group, Difference) %>% summarise(mean=mean(BMI_diff),median=median(BMI_diff))
## # A tibble: 9 x 4
## # Groups:   group [3]
##   group                      Difference   mean median
##   <ord>                      <ord>       <dbl>  <dbl>
## 1 Men                        Worsened   0.355   0.191
## 2 Men                        Stable     0.221   0    
## 3 Men                        Improved   0.0310  0    
## 4 "Post-menopausal \n women" Worsened   0.499   0.363
## 5 "Post-menopausal \n women" Stable     0.107   0    
## 6 "Post-menopausal \n women" Improved   0.591   0.367
## 7 "Pre-menopausal \n women"  Worsened   1.14    0.968
## 8 "Pre-menopausal \n women"  Stable     0.706   0.363
## 9 "Pre-menopausal \n women"  Improved   0.978   0.602
pbmd %>% group_by(group) %>% summarise(mean=mean(BMI_diff),median=median(BMI_diff))
## # A tibble: 3 x 3
##   group                       mean median
##   <ord>                      <dbl>  <dbl>
## 1 Men                        0.233  0    
## 2 "Post-menopausal \n women" 0.275  0.191
## 3 "Pre-menopausal \n women"  0.896  0.635

Donation frequency

pfqd <- final_data %>% 
  dplyr::select(donor, group, health, questionnaire) %>% 
  drop_na(group) %>% 
  filter( group != "Women_pre_menop_no_mens") %>% 
  spread(key = questionnaire, value = health) %>% 
  filter(First != "NA" & Second != "NA") %>%  
  mutate(Difference = case_when(First < Second ~ "Improved",
                                First == Second ~ "Stable",
                                TRUE ~ "Worsened")) %>% 
    mutate(Difference = ordered(Difference,levels=c("Worsened","Stable","Improved"))) %>% 
  inner_join(FD_activity_biomarker_data, by = c("donor", "group")) %>%
  mutate(donation_frequency = nb_donations_between_questionnaires/(interval_between_quest/12) ) %>% 
  mutate(group = case_when(group == "Pre_menopause_women" ~ "Pre-menopausal \n women",
                           group == "Post_menopause_women" ~ "Post-menopausal \n women",
                           TRUE ~ "Men"),
         group = ordered(group, levels = c(  "Men", "Post-menopausal \n women", "Pre-menopausal \n women") )) 
pfq <-  ggplot(pfqd, aes(x = Difference, y = donation_frequency, color = group)) +
  # geom_hline(yintercept = 15, color = "red", linetype = 3) +
  geom_boxplot(outlier.alpha = 0) +
  geom_beeswarm(alpha = 0.5, shape = 1) +
  #scale_y_log10() +
  facet_grid(group ~.,scales="free_y") +
  scale_color_manual(values = c( "#ff85a2",  "#de2d26","#00BFFF" ),
                     limits = c("Pre-menopausal \n women", "Post-menopausal \n women","Men" )) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("Evolution of self-rated health") +
  ylab("Doonation frequency (yearly)")
  


ggplot2::ggsave(plot=pfq,filename = "../results/figures/hyp_n_donations.pdf",
       width = fig.width,
       height = fig.height,
       dpi = 600,
       units = "cm")

pfq

pfqd %>% group_by(group, Difference) %>% summarise(mean=mean(donation_frequency),median=median(donation_frequency))
## # A tibble: 9 x 4
## # Groups:   group [3]
##   group                      Difference  mean median
##   <ord>                      <ord>      <dbl>  <dbl>
## 1 Men                        Worsened    2.86   2.61
## 2 Men                        Stable      3.12   3.11
## 3 Men                        Improved    2.76   2.55
## 4 "Post-menopausal \n women" Worsened    2.22   2.07
## 5 "Post-menopausal \n women" Stable      2.46   2.5 
## 6 "Post-menopausal \n women" Improved    2.60   2.61
## 7 "Pre-menopausal \n women"  Worsened    1.86   1.85
## 8 "Pre-menopausal \n women"  Stable      1.91   1.88
## 9 "Pre-menopausal \n women"  Improved    2.05   2.18
pfqd %>% group_by(group) %>% summarise(mean=mean(donation_frequency),median=median(donation_frequency))
## # A tibble: 3 x 3
##   group                       mean median
##   <ord>                      <dbl>  <dbl>
## 1 Men                         3.00   2.91
## 2 "Post-menopausal \n women"  2.41   2.48
## 3 "Pre-menopausal \n women"   1.92   1.89

Inital weigth

pwed <- final_data %>% 
  dplyr::select(donor, group, health, questionnaire) %>% 
  drop_na(group) %>% 
  filter( group != "Women_pre_menop_no_mens") %>% 
  spread(key = questionnaire, value = health) %>% 
  filter(First != "NA" & Second != "NA") %>%  
  mutate(Difference = case_when(First < Second ~ "Improved",
                                First == Second ~ "Stable",
                                TRUE ~ "Worsened")) %>% 
    mutate(Difference = ordered(Difference,levels=c("Worsened","Stable","Improved"))) %>% 
  inner_join(FD_activity_biomarker_data, by = c("donor", "group")) %>% 
  mutate(group = case_when(group == "Pre_menopause_women" ~ "Pre-menopausal \n women",
                           group == "Post_menopause_women" ~ "Post-menopausal \n women",
                           TRUE ~ "Men"),
         group = ordered(group, levels = c(  "Men", "Post-menopausal \n women", "Pre-menopausal \n women") )) 
pwe <-  ggplot(pwed, aes(x = Difference, y = starting_weight, color = group)) +
  # geom_hline(yintercept = 15, color = "red", linetype = 3) +
  geom_boxplot(outlier.alpha = 0) +
  geom_beeswarm(alpha = 0.5, shape = 1) +
  scale_y_log10() +
  facet_grid(group ~.,scales="free_y") +
  scale_color_manual(values = c( "#ff85a2",  "#de2d26","#00BFFF" ),
                     limits = c("Pre-menopausal \n women", "Post-menopausal \n women","Men" )) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("Evolution of self-rated health") +
  ylab("Initial weight (kg)") 
  
  


ggplot2::ggsave(plot=pwe,filename = "../results/figures/hyp_weight.pdf",
       width = fig.width,
       height = fig.height,
       dpi = 600,
       units = "cm")

pwe

pwed %>% group_by(group, Difference) %>% summarise(mean=mean(starting_weight),median=median(starting_weight))
## # A tibble: 9 x 4
## # Groups:   group [3]
##   group                      Difference  mean median
##   <ord>                      <ord>      <dbl>  <dbl>
## 1 Men                        Worsened    88.8   85  
## 2 Men                        Stable      84.2   83  
## 3 Men                        Improved    82.0   80  
## 4 "Post-menopausal \n women" Worsened    72.2   70  
## 5 "Post-menopausal \n women" Stable      71.0   69.5
## 6 "Post-menopausal \n women" Improved    69.2   68  
## 7 "Pre-menopausal \n women"  Worsened    72.1   69  
## 8 "Pre-menopausal \n women"  Stable      70.1   67  
## 9 "Pre-menopausal \n women"  Improved    69.8   65
pwed %>% group_by(group) %>% summarise(mean=mean(starting_weight),median=median(starting_weight))
## # A tibble: 3 x 3
##   group                       mean median
##   <ord>                      <dbl>  <dbl>
## 1 Men                         85.2     83
## 2 "Post-menopausal \n women"  71.1     70
## 3 "Pre-menopausal \n women"   70.7     67